home *** CD-ROM | disk | FTP | other *** search
/ Practical Internet 2002 February / Practical Internet February 2002.iso / pc / Software / Browsing / httrack-3.09e2.exe / {app} / src / htsparse.c < prev    next >
Encoding:
C/C++ Source or Header  |  2001-11-07  |  101.9 KB  |  2,233 lines

  1. /* ------------------------------------------------------------ */
  2. /*
  3. HTTrack Website Copier, Offline Browser for Windows and Unix
  4. Copyright (C) Xavier Roche and other contributors
  5.  
  6. This program is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU General Public License
  8. as published by the Free Software Foundation; either version 2
  9. of the License, or any later version.
  10.  
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. GNU General Public License for more details.
  15.  
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19.  
  20.  
  21. Important notes:
  22.  
  23. - We hereby ask people using this source NOT to use it for the purpose of grabbing
  24. email addresses, or collecting any other private information on persons.
  25. This would disgrace our work, and spoil the many hours we spent on it.
  26.  
  27.  
  28. Please visit our Website: http://www.httrack.com
  29. */
  30.  
  31.  
  32. /* ------------------------------------------------------------ */
  33. /* File: Main source                                            */
  34. /* DIRECT INCLUDE TO httrack.c                                  */
  35. /* Author: Xavier Roche                                         */
  36. /* ------------------------------------------------------------ */
  37.  
  38.  
  39. #if HTS_ANALYSTE
  40. if (hts_htmlcheck(r.adr,(int)r.size,urladr,urlfil)) {
  41. #endif          
  42.   FILE* fp=NULL;      // fichier Θcrit localement                                               // et si level>0
  43.   char* adr=r.adr;    // pointeur (on parcourt)
  44.   char* lastsaved;    // adresse du dernier octet sauvΘ + 1
  45.   if ( (opt.debug>1) && (opt.log!=NULL) ) {
  46.     fspc(opt.log,"debug"); fprintf(opt.log,"scan file.."LF); test_flush;
  47.   }
  48.  
  49.  
  50.   // Indexing!
  51. #if HTS_MAKE_KEYWORD_INDEX
  52.   if (opt.kindex) {
  53.     if (index_keyword(r.adr,r.size,r.contenttype,savename,opt.path_html)) {
  54.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  55.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..done"LF); test_flush;
  56.       }
  57.     } else {
  58.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  59.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..error!"LF); test_flush;
  60.       }
  61.     }
  62.   }
  63. #endif
  64.  
  65.   // Now, parsing
  66.   if ((opt.getmode & 1) && (ptr>0)) {  // rΘcupΘrer les html sur disque       
  67.     // crΘer le fichier html local
  68.     HT_ADD_FOP;   // Θcrire peu α peu le fichier
  69.   }
  70.   
  71.   if (!error) {
  72.     int detect_title=0;  // dΘtection  du title
  73.     //
  74.     char* in_media=NULL; // in other media type (real media and so..)
  75.     int intag=0;         // on est dans un tag
  76.     int incomment=0;     // dans un <!--
  77.     int inscript=0;      // dans un scipt pour applets javascript)
  78.     int inscript_tag=0;  // on est dans un <body onLoad="... terminΘ par >
  79.     char inscript_tag_lastc='\0';  
  80.                            // terminaison (" ou ') du "<body onLoad=.."
  81.     int inscriptgen=0;     // on est dans un code gΘnΘrant, ex aprΦs obj.write("..
  82.     char scriptgen_q='\0'; // caractΦre faisant office de guillemet (' ou ")
  83.     int nofollow=0;        // ne pas scanner
  84.     //
  85.     int parseall_lastc='\0';    // dernier caractΦre parsΘ pour parseall
  86.     int parseall_incomment=0;   // dans un /* */ (exemple: a = /* URL */ "img.gif";)
  87.     //
  88.     char* intag_start=adr;
  89.     char* intag_startattr=NULL;
  90.     int intag_start_valid=0;
  91.     HT_ADD_START;    // dΘbuter
  92.  
  93.  
  94.     /* statistics */
  95.     if ((opt.getmode & 1) && (ptr>0)) { 
  96.       /*
  97.       HTS_STAT.stat_files++;
  98.       HTS_STAT.stat_bytes+=r.size;
  99.       */
  100.     }
  101.  
  102.     /* Primary list or URLs */
  103.     if (ptr == 0) {
  104.       intag=1;
  105.       intag_start_valid=0;
  106.     }
  107.     /* Check if the file is a .js or .css file */
  108.     else if (
  109.       (strfield2(r.contenttype,"application/x-javascript")!=0)
  110.       || (strfield2(r.contenttype,"text/css")!=0)
  111.       ) {      /* JavaScript js file */
  112.       inscript=1;
  113.       intag=1;     // because aprΦs <script> on y est .. - pas utile
  114.       intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  115.       if ((opt.debug>1) && (opt.log!=NULL)) {
  116.         fspc(opt.log,"debug"); fprintf(opt.log,"note: this file is a javascript file"LF); test_flush;
  117.       }
  118.     }
  119.     /* Or a real audio */
  120.     else if (strfield2(r.contenttype,"audio/x-pn-realaudio")!=0) {      /* realaudio link file */
  121.       inscript=intag=1;
  122.       intag_start_valid=0;
  123.       in_media="RAM";       // real media!
  124.     }
  125.     // Hack to prevent any problems with ram files of other files
  126.     * ( r.adr + r.size ) = '\0';
  127.  
  128.  
  129.     // ------------------------------------------------------------
  130.     // analyser ce qu'il y a en mémoire (fichier html)
  131.     // on scanne les balises
  132.     // ------------------------------------------------------------
  133. #if HTS_ANALYSTE
  134.     _hts_in_html_done=0;     // 0% scannΘs
  135.     _hts_cancel=0;           // pas de cancel
  136.     _hts_in_html_parsing=1;  // flag pour indiquer un parsing
  137. #endif
  138.     base[0]='\0';    // effacer base-href
  139.     lastsaved=adr;
  140.     do {
  141.       int p=0;
  142.       int valid_p=0;      // force to take p even if == 0
  143.       int ending_p='\0';  // ending quote?
  144.       error=0;
  145.  
  146.       /* Hack to avoid NULL char problems with C syntax */
  147.       /* Yes, some bogus HTML pages can embed null chars
  148.          and therefore can not be properly handled if this hack is not done
  149.       */
  150.       if ( ! (*adr) ) {
  151.         if (( ((int) adr) - ((int) r.adr) ) < r.size)
  152.           *adr=' ';
  153.       }
  154.  
  155.  
  156.  
  157.       /*
  158.       index.html built here
  159.       */
  160.       // Construction index.html (sommaire)
  161.       // Avant de tester les a href,
  162.       // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s)
  163.       if (!makeindex_done) {  // autoriation d'Θcrire un index
  164.         if (!detect_title) {
  165.           if (opt.depth == liens[ptr]->depth) {    // on note toujours les premiers liens
  166.             if (!in_media) {
  167.               if (opt.makeindex && (ptr>0)) {
  168.                 if (opt.getmode & 1) {  // autorisation d'Θcrire
  169.                   p=strfield(adr,"title");  
  170.                   if (p) {
  171.                     if (*(adr-1)=='/') p=0;    // /title
  172.                   } else {
  173.                     if (strfield(adr,"/html"))
  174.                       p=-1;                    // noter, mais sans titre
  175.                     else if (strfield(adr,"body"))
  176.                       p=-1;                    // noter, mais sans titre
  177.                     else if (( ((int) adr) - ((int) r.adr) ) >= (r.size-1) )
  178.                       p=-1;                    // noter, mais sans titre
  179.                   }
  180.                 } else
  181.                   p=0;
  182.                 
  183.                 if (p) {    // ok center                            
  184.                   if (makeindex_fp==NULL) {
  185.                     verif_backblue(opt.path_html);    // gΘnΘrer gif
  186.                     makeindex_fp=filecreate(fconcat(opt.path_html,"index.html"));
  187.                     if (makeindex_fp!=NULL) {
  188.  
  189.                       // Header
  190.                       fprintf(makeindex_fp,template_header,
  191.                         "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"
  192.                         );
  193.  
  194.                     } else makeindex_done=-1;    // fait, erreur
  195.                   }
  196.                   
  197.                   if (makeindex_fp!=NULL) {
  198.                     char tempo[HTS_URLMAXSIZE*2];
  199.                     char s[HTS_URLMAXSIZE*2];
  200.                     char* a=NULL;
  201.                     char* b=NULL;
  202.                     s[0]='\0';
  203.                     if (p>0) {
  204.                       a=strchr(adr,'>');
  205.                       if (a!=NULL) {
  206.                         a++;
  207.                         while(is_space(*a)) a++;    // sauter espaces & co
  208.                         b=strchr(a,'<');   // prochain tag
  209.                       }
  210.                     }
  211.                     if (lienrelatif(tempo,liens[ptr]->sav,concat(opt.path_html,"index.html"))==0) {
  212.                       detect_title=1;      // ok dΘtectΘ pour cette page!
  213.                       makeindex_links++;   // un de plus
  214.                       strcpy(makeindex_firstlink,tempo);
  215.                       //
  216.                       if ((b==a) || (a==NULL) || (b==NULL)) {    // pas de titre
  217.                         strcpy(s,tempo);
  218.                       } else if ((b-a)<256) {
  219.                         b--;
  220.                         while(is_space(*b)) b--;
  221.                         strncpy(s,a,b-a+1);
  222.                         *(s+(b-a)+1)='\0';
  223.                       }
  224.  
  225.                       // Body
  226.                       fprintf(makeindex_fp,template_body,
  227.                         tempo,
  228.                         s
  229.                         );
  230.  
  231.                     }
  232.                   }
  233.                 }
  234.               }
  235.             }
  236.             
  237.           } else if (liens[ptr]->depth<opt.depth) {   // on a sautΘ level1+1 et level1
  238.             if (makeindex_fp) {
  239.               char tempo[1024];
  240.               if (makeindex_links == 1) {
  241.                 sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink);
  242.               } else
  243.                 tempo[0]='\0';
  244.  
  245.               // Footer
  246.               fprintf(makeindex_fp,template_footer,
  247.                 "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->",
  248.                 tempo
  249.                 );
  250.  
  251.               fflush(makeindex_fp);
  252.               fclose(makeindex_fp);  // α ne pas oublier sinon on passe une nuit blanche
  253.               makeindex_fp=NULL;
  254.               usercommand(0,NULL,fconcat(opt.path_html,"index.html"));                            
  255.             }
  256.             makeindex_done=1;    // ok c'est fait
  257.           }
  258.         } // if (opt.makeindex)
  259.       }
  260.       // FIN Construction index.html (sommaire)
  261.       /*
  262.       end -- index.html built here
  263.       */
  264.       
  265.  
  266.  
  267.       /* Parse */
  268.       if (
  269.            (*adr=='<')    /* No starting tag */
  270.         && (!inscript)    /* Not in (java)script */
  271.         && (!incomment)   /* Not in comment (<!--) */
  272.       ) { 
  273.         intag=1;
  274.         parseall_incomment=0;
  275.         //inquote=0;  // effacer quote
  276.         intag_start=adr; intag_start_valid=1;
  277.         codebase[0]='\0';    // effacer Θventuel codebase
  278.         
  279.         if (opt.getmode & 1) {  // sauver html
  280.           p=strfield(adr,"</html");
  281.           if (p==0) p=strfield(adr,"<head>");
  282.           if (p) {
  283.             if (strnotempty(opt.footer)) {
  284.               char tempo[1024+HTS_URLMAXSIZE*2];
  285.               char gmttime[256];
  286.               char* eol="\n";
  287.               tempo[0]='\0';
  288.               if (strchr(r.adr,'\r'))
  289.                 eol="\r\n";
  290.               time_gmt_rfc822(gmttime);
  291.               strcat(tempo,eol);
  292.               sprintf(tempo+strlen(tempo),opt.footer,jump_identification(urladr),urlfil,gmttime,"","","","","","","","");
  293.               strcat(tempo,eol);
  294.               //fwrite(tempo,1,strlen(tempo),fp);
  295.               HT_ADD(tempo);
  296.             }
  297.           }
  298.         }        
  299.         
  300.         // Θliminer les <!-- (commentaires) : intag dΘvalidΘ
  301.         if (*(adr+1)=='!')
  302.           if (*(adr+2)=='-')
  303.             if (*(adr+3)=='-') {
  304.               intag=0;
  305.               incomment=1;
  306.               intag_start_valid=0;
  307.             }
  308.             
  309.       }
  310.       else if (
  311.            (*adr=='>')                        /* ending tag */
  312.         && ( (!inscript) || (inscript_tag) )  /* and in tag (or in script) */
  313.       ) {
  314.         if (inscript_tag) {
  315.           inscript_tag=inscript=0;
  316.           intag=0;
  317.           incomment=0;
  318.           intag_start_valid=0;
  319.         } else if (!incomment) {
  320.           intag=0; //inquote=0;
  321.           
  322.           // entrΘe dans du javascript?
  323.           // on parse ICI car il se peut qu'on ait eu a parser les src=.. dedans
  324.           //if (!inscript) {  // sinon on est dans un obj.write("..
  325.           if ((intag_start_valid) && 
  326.             (
  327.             check_tag(intag_start,"script")
  328.             ||
  329.             check_tag(intag_start,"style")
  330.             )
  331.             ) {
  332.             char* a=intag_start;    // <
  333.             // ** while(is_realspace(*(--a)));
  334.             if (*a=='<') {  // s√r que c'est un tag?
  335.               inscript=1;
  336.               intag=1;     // because aprΦs <script> on y est .. - pas utile
  337.               intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  338.             }
  339.           }
  340.         } else {                               /* end of comment? */
  341.           // vΘrifier fermeture correcte
  342.           if ( (*(adr-1)=='-') && (*(adr-2)=='-') ) {
  343.             intag=0;
  344.             incomment=0;
  345.             intag_start_valid=0;
  346.           }
  347. #if GT_ENDS_COMMENT
  348.           /* wrong comment ending */
  349.           else {
  350.             /* check whether a correct ending does not exist further on:
  351.                <!-- foo > example <!-- bar > is sometimes accepted by browsers
  352.                when no --> is used somewhere else.. darn those browsers are dirty
  353.             */
  354.             if (!strstr(adr,"-->")) {
  355.               intag=0;
  356.               incomment=0;
  357.               intag_start_valid=0;
  358.             }
  359.           }
  360. #endif
  361.         }
  362.         //}
  363.       }
  364.       //else if (*adr==34) {
  365.       //  inquote=(inquote?0:1);
  366.       //}
  367.       else if (intag || inscript) {    // nous sommes dans un tag/commentaire, tester si on recoit un tag
  368.         int p_type=0;
  369.         int p_nocatch=0;
  370.         int p_searchMETAURL=0;  // chercher ..URL=<url>
  371.         int add_class=0;        // ajouter .class
  372.         int add_class_dots_to_patch=0;   // number of '.' in code="x.y.z<realname>"
  373.         char* p_flush=NULL;
  374.         
  375.         
  376.         // ------------------------------------------------------------
  377.         // parsing ΘvolΘ
  378.         // ------------------------------------------------------------
  379.         if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (inscriptgen))) {  // sinon pas la peine de tester..
  380.  
  381.  
  382.           /* caractère de terminaison pour "miniparsing" javascript=.. ? 
  383.              (ex: <a href="javascript:()" action="foo"> ) */
  384.           if (inscript_tag) {
  385.             if (inscript_tag_lastc) {
  386.               if (*adr == inscript_tag_lastc) {
  387.                 /* sortir */
  388.                 inscript_tag=inscript=0;
  389.                 incomment=0;
  390.               }
  391.             }
  392.           }
  393.           
  394.           
  395.           // Note:
  396.           // Certaines pages ne respectent pas le html
  397.           // notamment les guillemets ne sont pas fixés
  398.           // Nous sommes dans un tag, donc on peut faire un test plus
  399.           // large pour pouvoir prendre en compte ces particularités
  400.           
  401.           // à vérifier: ACTION, CODEBASE, VRML
  402.           
  403.           if (in_media) {
  404.             if (strcmp(in_media,"RAM")==0) { // real media
  405.               p=0;
  406.               valid_p=1;
  407.             }
  408.           } else if (ptr>0) {        /* pas premiΦre page 0 (primary) */
  409.             p=0;  // saut pour le nom de fichier: adresse nom fichier=adr+p
  410.             
  411.             // ------------------------------
  412.             // détection d'écriture JavaScript.
  413.             // osons les obj.write et les obj.href=.. ! osons!
  414.             // note: inscript==1 donc on sautera après les \"
  415.             if (inscript) {
  416.               if (inscriptgen) {          // on est dΘja dans un objet gΘnΘrant..
  417.                 if (*adr==scriptgen_q) {  // fermeture des " ou '
  418.                   if (*(adr-1)!='\\') {   // non
  419.                     inscriptgen=0;        // ok parsing terminΘ
  420.                   }
  421.                 }
  422.               } else {
  423.                 char* a=NULL;
  424.                 char check_this_fking_line=0;  // parsing code javascript..
  425.                 char must_be_terminated=0;     // caractΦre obligatoire de terminaison!
  426.                 int token_size;
  427.                 if (!(token_size=strfield(adr,".writeln"))) // dΘtection ...objet.write[ln]("code html")...
  428.                   token_size=strfield(adr,".write");
  429.                 if (token_size) {
  430.                   a=adr+token_size;
  431.                   while(is_realspace(*a)) a++; // sauter espaces
  432.                   if (*a=='(') {  // dΘbut parenthΦse
  433.                     check_this_fking_line=2;  // α parser!
  434.                     must_be_terminated=')';
  435.                     a++;  // sauter (
  436.                   }
  437.                 }
  438.                 // euhh ??? ???
  439.                 /* else if (strfield(adr,".href")) {  // dΘtection ...objet.href="...
  440.                 a=adr+5;
  441.                 while(is_realspace(*a)) a++; // sauter espaces
  442.                 if (*a=='=') {  // ohh un Θgal
  443.                 check_this_fking_line=1;  // α noter!
  444.                 must_be_terminated=';';   // et si t'as oubliΘ le ; tu sais pas coder
  445.                 a++;   // sauter =
  446.                 }
  447.                 
  448.                 }*/
  449.                 
  450.                 // on a un truc du genre instruction"code gΘnΘrΘ" dont on parse le code
  451.                 if (check_this_fking_line) {
  452.                   while(is_realspace(*a)) a++;
  453.                   if ((*a=='\'') || (*a=='"')) {  // dΘpart de '' ou ""
  454.                     char *b;
  455.                     int ex=0;
  456.                     scriptgen_q=*a;    // quote
  457.                     b=a+1;      // dΘpart de la chaεne
  458.                     // vΘrifier forme ("code") et pas ("code"+var), ingΘrable
  459.                     do {
  460.                       a++;  // caractΦre suivant
  461.                       if (*a==scriptgen_q) if (*(a-1)!='\\')  // quote non slash
  462.                         ex=1;            // sortie
  463.                       if ((*a==10) || (*a==13))
  464.                         ex=1;
  465.                     } while(!ex);
  466.                     if (*a==scriptgen_q) {  // fin du quote
  467.                       a++;
  468.                       while(is_realspace(*a)) a++;
  469.                       if (*a==must_be_terminated) {  // parenthΦse fermante: ("..")
  470.                         
  471.                         // bon, on doit parser une ligne javascript
  472.                         // 1) si check.. ==1 alors c'est un nom de fichier direct, donc
  473.                         // on fixe p sur le saut nécessaire pour atteindre le nom du fichier
  474.                         // et le moteur se débrouillera ensuite tout seul comme un grand
  475.                         // 2) si check==2 c'est un peu plus tordu car là on génère du
  476.                         // code html au sein de code javascript au sein de code html
  477.                         // dans ce cas on doit fixer un flag à un puis ensuite dans la boucle
  478.                         // on devra parser les instructions standard comme <a href etc
  479.                         // NOTE: le code javascript autogénéré n'est pas pris en compte!!
  480.                         // (et ne marche pas dans 50% des cas de toute façon!)
  481.                         if (check_this_fking_line==1) {
  482.                           p=(int) b-(int) adr;  // calculer saut!
  483.                         } else {
  484.                           inscriptgen=1;        // SCRIPTGEN actif
  485.                           adr=b;                // jump
  486.                         }
  487.                         
  488.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  489.                           char str[512];
  490.                           str[0]='\0';
  491.                           strncat(str,b,minimum((int) a-(int) b+1,32));
  492.                           fspc(opt.log,"debug"); fprintf(opt.log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush;
  493.                         }
  494.                       }
  495.                       
  496.                     }
  497.                     
  498.                   }
  499.                   
  500.                   
  501.                 }
  502.               }
  503.             }
  504.             // fin détection code générant javascript vers html
  505.             // ------------------------------
  506.             
  507.             
  508.             // analyse proprement dite, A HREF=.. etc..
  509.             if (!p) {
  510.               // si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
  511.               if ((intag && (!inscript)) || inscriptgen) {
  512.                 if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
  513.                   // <A HREF=.. pour les liens HTML
  514.                   p=rech_tageq(adr,"href");
  515.                   if (p) {    // href.. tester si c'est une bas href!
  516.                     if ((intag_start_valid) && check_tag(intag_start,"base")) {  // oui!
  517.                       // ** note: base href et codebase ne font pas bon mΘnage..
  518.                       p_type=2;    // c'est un chemin
  519.                     }
  520.                   }
  521.                   
  522.                   /* Tags supplΘmentaires α vΘrifier (<img src=..> etc) */
  523.                   if (p==0) {
  524.                     int i=0;
  525.                     while( (p==0) && (strnotempty(hts_detect[i])) ) {
  526.                       p=rech_tageq(adr,hts_detect[i]);
  527.                       i++;
  528.                     }
  529.                   }
  530.  
  531.                   /* Tags supplΘmentaires en dΘbut α vΘrifier (<object .. hotspot1=..> etc) */
  532.                   if (p==0) {
  533.                     int i=0;
  534.                     while( (p==0) && (strnotempty(hts_detectbeg[i])) ) {
  535.                       p=rech_tageqbegdigits(adr,hts_detectbeg[i]);
  536.                       i++;
  537.                     }
  538.                   }
  539.                   
  540.                   /* Tags supplΘmentaires α vΘrifier : URL=.. */
  541.                   if (p==0) {
  542.                     int i=0;
  543.                     while( (p==0) && (strnotempty(hts_detectURL[i])) ) {
  544.                       p=rech_tageq(adr,hts_detectURL[i]);
  545.                       i++;
  546.                     }
  547.                     if (p)
  548.                       p_searchMETAURL=1;
  549.                   }
  550.                   
  551.                   /* Tags supplΘmentaires α vΘrifier, mais α ne pas capturer */
  552.                   if (p==0) {
  553.                     int i=0;
  554.                     while( (p==0) && (strnotempty(hts_detectandleave[i])) ) {
  555.                       p=rech_tageq(adr,hts_detectandleave[i]);
  556.                       i++;
  557.                     }
  558.                     if (p)
  559.                       p_nocatch=1;      /* ne pas rechercher */
  560.                   }
  561.                   
  562.                   /* EvΘnements */
  563.                   if (p==0) {
  564.                     int i=0;
  565.                     /* dΘtection onLoad etc */
  566.                     while( (p==0) && (strnotempty(hts_detect_js[i])) ) {
  567.                       p=rech_tageq(adr,hts_detect_js[i]);
  568.                       i++;
  569.                     }
  570.                     /* non dΘtectΘ - dΘtecter Θgalement les onXxxxx= */
  571.                     if (p==0) {
  572.                       if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) {
  573.                         p=0;
  574.                         while(isalpha((unsigned char)adr[p]) && (p<64) ) p++;
  575.                         if (p<64) {
  576.                           while(is_space(adr[p])) p++;
  577.                           if (adr[p]=='=')
  578.                             p++;
  579.                           else p=0;
  580.                         } else p=0;
  581.                       }
  582.                     }
  583.                     /* OK, ΘvΘnement repΘrΘ */
  584.                     if (p) {
  585.                       inscript_tag_lastc=*(adr+p);     /* α attendre α la fin */
  586.                       adr+=p;     /* saut */
  587.                                   /*
  588.                                   On est dΘsormais dans du code javascript
  589.                       */
  590.                       inscript_tag=inscript=1;
  591.                     }
  592.                     p=0;        /* quoi qu'il arrive, ne rien dΘmarrer ici */
  593.                   }
  594.                   
  595.                   // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) α faire]
  596.                   if (p==0) {
  597.                     p=rech_tageq(adr,"code");
  598.                     if (p) {
  599.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  600.                         p_type=-1;  // juste le nom de fichier+dossier, Θcire avant codebase 
  601.                         add_class=1;   // ajouter .class au besoin                         
  602.                         
  603.                         // vérifier qu'il n'y a pas de codebase APRES
  604.                         // sinon on swappe les deux.
  605.                         // pas très propre mais c'est ce qu'il y a de plus simple à faire!!
  606.                         
  607.                         {
  608.                           char *a;
  609.                           a=adr;
  610.                           while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++;
  611.                           if (rech_tageq(a,"codebase")) {  // banzai! codebase=
  612.                             char* b;
  613.                             b=strchr(a,'>');
  614.                             if (b) {
  615.                               if (((int) b - (int) adr) < 1000) {    // au total < 1Ko
  616.                                 char tempo[HTS_URLMAXSIZE*2];
  617.                                 tempo[0]='\0';
  618.                                 strncat(tempo,a,(int) b - (int) a);
  619.                                 strcat( tempo," ");
  620.                                 strncat(tempo,adr,(int) a - (int) adr - 1);
  621.                                 // Θventuellement remplire par des espaces pour avoir juste la taille
  622.                                 while((int) strlen(tempo)<((int) b - (int) adr))
  623.                                   strcat(tempo," ");
  624.                                 // pas d'erreur?
  625.                                 if ((int) strlen(tempo) == ((int) b - (int) adr)) {
  626.                                   strncpy(adr,tempo,strlen(tempo));   // PAS d'octet nul α la fin!
  627.                                   p=0;    // DEVALIDER!!
  628.                                   p_type=0;
  629.                                   add_class=0;
  630.                                 }
  631.                               }
  632.                             }
  633.                           }
  634.                         }
  635.                         
  636.                       }
  637.                     }
  638.                   }
  639.                   
  640.                   // liens α patcher mais pas α charger (ex: codebase)
  641.                   if (p==0) {  // note: si non chargΘ (ex: ignorer .class) patchΘ tout de mΩme
  642.                     p=rech_tageq(adr,"codebase");
  643.                     if (p) {
  644.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  645.                         p_type=-2;
  646.                       } else p=-1;   // ne plus chercher
  647.                     }
  648.                   }
  649.                   
  650.                   
  651.                   // Meta tags pour robots
  652.                   if (p==0) {
  653.                     if (opt.robots) {
  654.                       if ((intag_start_valid) && check_tag(intag_start,"meta")) {
  655.                         if (rech_tageq(adr,"name")) {    // name=robots.txt
  656.                           char tempo[1100];
  657.                           char* a;
  658.                           tempo[0]='\0';
  659.                           a=strchr(adr,'>');
  660. #if DEBUG_ROBOTS
  661.                           printf("robots.txt meta tag detected\n");
  662. #endif
  663.                           if (a) {
  664.                             if (((int) a - (int) adr) < 999 ) {
  665.                               strncat(tempo,adr,(int) a - (int) adr);
  666.                               if (strstrcase(tempo,"content")) {
  667.                                 if (strstrcase(tempo,"robots")) {
  668.                                   if (strstrcase(tempo,"nofollow")) {
  669. #if DEBUG_ROBOTS
  670.                                     printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil);
  671. #endif
  672.                                     nofollow=1;       // NE PLUS suivre liens dans cette page
  673.                                     if (opt.errlog) {
  674.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil);
  675.                                       test_flush;
  676.                                     }
  677.                                   }
  678.                                 }
  679.                               }
  680.                             }
  681.                           }
  682.                         }
  683.                       }
  684.                     }
  685.                   }
  686.                   
  687.                   // entrée dans une applet javascript
  688.                   /*if (!inscript) {  // sinon on est dans un obj.write("..
  689.                   if (p==0)
  690.                   if (rech_sampletag(adr,"script"))
  691.                   if (check_tag(intag_start,"script")) {
  692.                   inscript=1;
  693.                   }
  694.                         }*/
  695.                   
  696.                   // Ici on procède à une analyse du code javascript pour tenter de récupérer
  697.                   // certains fichiers évidents.
  698.                   // C'est devenu obligatoire vu le nombre de pages qui intègrent
  699.                   // des images réactives par exemple
  700.                 }
  701.               } else if (inscript) {
  702.                 if (
  703.                   (
  704.                   (strfield(adr,"/script"))
  705.                   ||
  706.                   (strfield(adr,"/style"))
  707.                   )
  708.                   ) {
  709.                   char* a=adr;
  710.                   //while(is_realspace(*(--a)));
  711.                   while( is_realspace(*a) ) a--;
  712.                   a--;
  713.                   if (*a=='<') {  // s√r que c'est un tag?
  714.                     inscript=0;
  715.                   }
  716.                 } else {
  717.                   int nc;
  718.                   char  expected     = '=';          // caractΦre attendu aprΦs
  719.                   char* expected_end = ";";
  720.                   int can_avoid_quotes=0;
  721.                   char quotes_replacement='\0';
  722.                   if (inscript_tag)
  723.                     expected_end=";\"\'";            // voir a href="javascript:doc.location='foo'"
  724.                   nc = strfield(adr,".src");  // nom.src="image";
  725.                   if (!nc) nc = strfield(adr,".location");  // document.location="doc"
  726.                   if (!nc) nc = strfield(adr,".href");  // document.location="doc"
  727.                   if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",..
  728.                     expected='(';    // parenthΦse
  729.                     expected_end="),";  // fin: virgule ou parenthΦse
  730.                   }
  731.                   if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url")
  732.                     expected='(';    // parenthΦse
  733.                     expected_end=")";  // fin: parenthΦse
  734.                   }
  735.                   if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url")
  736.                     expected='(';    // parenthΦse
  737.                     expected_end=")";  // fin: parenthΦse
  738.                   }
  739.                   if (!nc) if ( (nc = strfield(adr,"url")) ) { // url(url)
  740.                     expected='(';    // parenthΦse
  741.                     expected_end=")";  // fin: parenthΦse
  742.                     can_avoid_quotes=1;
  743.                     quotes_replacement=')';
  744.                   }
  745.                   if (nc) {
  746.                     char *a;
  747.                     a=adr+nc;
  748.                     while(is_space(*a)) a++;
  749.                     if (*a == expected) {
  750.                       a++;
  751.                       while(is_realspace(*a)) a++;
  752.                       if ((*a==34) || (*a=='\'') || (can_avoid_quotes)) {
  753.                         char *b,*c;
  754.                         int ndelim=1;
  755.                         if ((*a==34) || (*a=='\''))
  756.                           a++;
  757.                         else
  758.                           ndelim=0;
  759.                         b=a;
  760.                         if (ndelim) {
  761.                           while((*b!=34) && (*b!='\'') && (*b!='\0')) b++;
  762.                         }
  763.                         else {
  764.                           while((*b != quotes_replacement) && (*b!='\0')) b++;
  765.                         }
  766.                         c=b--; c+=ndelim;
  767.                         while(*c==' ') c++;
  768.                         if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) {
  769.                           c-=(ndelim+1);
  770.                           if ((int) c-(int) a+1) {
  771.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  772.                               char str[512];
  773.                               str[0]='\0';
  774.                               strncat(str,a,minimum((int) c-(int) a+1,32));
  775.                               fspc(opt.log,"debug"); fprintf(opt.log,"link detected in javascript: %s"LF,str); test_flush;
  776.                             }
  777.                             p=(int) a- (int) adr;    // p non nul: TRAITER CHAINE COMME FICHIER
  778.                             if (can_avoid_quotes) {
  779.                               ending_p=quotes_replacement;
  780.                             }
  781.                           }
  782.                         }
  783.                         
  784.                         
  785.                       }
  786.                     }
  787.                   }
  788.                   
  789.                 }
  790.               }
  791.             }
  792.             
  793.           } else {      // ptr == 0
  794.             //p=rech_tageq(adr,"primary");    // lien primaire, yeah
  795.             p=0;          // No stupid tag anymore, raw link
  796.             valid_p=1;    // Valid even if p==0
  797.             while ((adr[p] == '\r') || (adr[p] == '\n'))
  798.               p++;
  799.             //can_avoid_quotes=1;
  800.             ending_p='\r';
  801.           }       
  802.           
  803.         } else if (isspace((unsigned char)*adr)) {
  804.           intag_startattr=adr+1;        // attribute in tag (for dirty parsing)
  805.         }
  806.           
  807.           
  808.           // ------------------------------------------------------------
  809.           // dernier recours - parsing "sale" : détection systématique des .gif, etc.
  810.           // risque: générer de faux fichiers parasites
  811.           // fix: ne parse plus dans les commentaires
  812.           // ------------------------------------------------------------
  813.           if ( (opt.parseall) && (ptr>0) && (!in_media) ) {           // option parsing "brut"
  814.             int incomment_justquit=0;
  815.             if (!is_realspace(*adr)) {
  816.               int noparse=0;
  817.  
  818.               // Gestion des /* */
  819.               if (inscript) {
  820.                 if (parseall_incomment) {
  821.                   if ((*adr=='/') && (*(adr-1)=='*'))
  822.                     parseall_incomment=0;
  823.                   incomment_justquit=1;       // ne pas noter dernier caractΦre
  824.                 } else {
  825.                   if ((*adr=='/') && (*(adr+1)=='*'))
  826.                     parseall_incomment=1;
  827.                 }
  828.               } else
  829.                 parseall_incomment=0;
  830.  
  831.               /* vérifier que l'on n'est pas dans un <!-- --> pur */
  832.               if ( (!intag) && (incomment) && (!inscript))
  833.                 noparse=1;        /* commentaire */
  834.  
  835.               // recherche d'URLs
  836.               if ((!parseall_incomment) && (!noparse)) {
  837.                 if (!p) {                   // non dΘja trouvΘ
  838.                   if (adr != r.adr) {     // >1 caractΦre
  839.                     // scanner les chaines
  840.                     if ((*adr == '\"') || (*adr=='\'')) {         // "xx.gif" 'xx.gif'
  841.                       if (strchr("=(,",parseall_lastc)) {    // exemple: a="img.gif..
  842.                         char *a=adr;
  843.                         char stop=*adr;  // " ou '
  844.                         int count=0;
  845.                         
  846.                         // sauter caractΦres
  847.                         a++;
  848.                         // copier
  849.                         while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; }
  850.                         
  851.                         // ok chaine terminΘe par " ou '
  852.                         if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) {
  853.                           char c;
  854.                           char* aend;
  855.                           //
  856.                           aend=a;     // sauver dΘbut
  857.                           a++;
  858.                           while(is_taborspace(*a)) a++;
  859.                           c=*a;
  860.                           if (strchr("),;>/+\r\n",c)) {     // exemple: ..img.gif";
  861.                             // le / est pour funct("img.gif" /* URL */);
  862.                             char tempo[HTS_URLMAXSIZE*2];
  863.                             char type[256];
  864.                             int url_ok=0;      // url valide?
  865.                             tempo[0]='\0'; type[0]='\0';
  866.                             //
  867.                             strncat(tempo,adr+1,count);
  868.                             //
  869.                             if ((!strchr(tempo,' ')) || inscript) {   // espace dedans: mΘfiance! (sauf dans code javascript)
  870.                               int invalid_url=0;
  871.                               
  872.                               // Couper au # ou ? Θventuel
  873.                               {
  874.                                 char* a=strchr(tempo,'#');
  875.                                 if (a)
  876.                                   *a='\0';
  877.                                 a=strchr(tempo,'?');
  878.                                 if (a)
  879.                                   *a='\0';
  880.                               }
  881.  
  882.                               // vΘrifier qu'il n'y a pas de caractΦres spΘciaux
  883.                               if (!strnotempty(tempo))
  884.                                 invalid_url=1;
  885.                               else if (strchr(tempo,'*')
  886.                                 || strchr(tempo,'<')
  887.                                 || strchr(tempo,'>'))
  888.                                 invalid_url=1;
  889.                               
  890.                               /* non invalide? */
  891.                               if (!invalid_url) {
  892.                                 // Un plus à la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag)
  893.                                 if (c!='+') {    // PAS de plus α la fin
  894.                                   char* a;
  895.                                   // "Comparisons of scheme names MUST be case-insensitive" (RFC2616)                                  
  896.                                   //if ((strncmp(tempo,"http://",7)==0) || (strncmp(tempo,"ftp://",6)==0))  // ok pas de problΦme
  897.                                   if ((strfield(tempo,"http:")) || (strfield(tempo,"ftp:")))  // ok pas de problΦme
  898.                                     url_ok=1;
  899.                                   else if (tempo[strlen(tempo)-1]=='/') {        // un slash: ok..
  900.                                     if (inscript)   // sinon si pas javascript, mΘfiance (rΘpertoire style base?)
  901.                                       url_ok=1;
  902.                                   } else if ((a=strchr(tempo,'/'))) {        // un slash: ok..
  903.                                     if (inscript) {    // sinon si pas javascript, mΘfiance (style "text/css")
  904.                                       if (strchr(a+1,'/'))  // un seul / : abandon (STYLE type='text/css')
  905.                                         url_ok=1;
  906.                                     }
  907.                                   }
  908.                                 }
  909.                                 // Prendre si extension reconnue
  910.                                 if (!url_ok) {
  911.                                   get_httptype(type,tempo,0);
  912.                                   if (strnotempty(type))     // type reconnu!
  913.                                     url_ok=1;
  914.                                   else if (is_dyntype(get_ext(tempo)))  // reconnu php,cgi,asp..
  915.                                     url_ok=1;
  916.                                   // MAIS pas les foobar@aol.com !!
  917.                                   if (strchr(tempo,'@'))
  918.                                     url_ok=0;
  919.                                 }
  920.                                 //
  921.                                 // Ok, cela pourrait Ωtre une URL
  922.                                 if (url_ok) {
  923.                                   
  924.                                   // Check if not fodbidden tag (id,name..)
  925.                                   if (intag_start_valid) {
  926.                                     if (intag_start)
  927.                                       if (intag_startattr)
  928.                                         if (intag)
  929.                                           if (!inscript)
  930.                                             if (!incomment) {
  931.                                               int i=0,nop=0;
  932.                                               while( (nop==0) && (strnotempty(hts_nodetect[i])) ) {
  933.                                                 nop=rech_tageq(intag_startattr,hts_nodetect[i]);
  934.                                                 i++;
  935.                                               }
  936.                                               // Forbidden tag
  937.                                               if (nop) {
  938.                                                 url_ok=0;
  939.                                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  940.                                                   fspc(opt.log,"debug"); fprintf(opt.log,"dirty parsing: bad tag avoided: %s"LF,hts_nodetect[i-1]); test_flush;
  941.                                                 }
  942.                                               }
  943.                                             }
  944.                                   }
  945.                                   
  946.                                   
  947.                                   // Accepter URL, on la traitera comme une URL normale!!
  948.                                   if (url_ok)
  949.                                     p=1;
  950.  
  951.                                 }
  952.                               }
  953.                             }
  954.                           }
  955.                         }
  956.                       }
  957.                     }
  958.                   }
  959.                 }  // p == 0
  960.                 
  961.                 // plus dans un commentaire
  962.                 if (!incomment_justquit)
  963.                   parseall_lastc=*adr;             // caractΦre avant le prochain
  964.                 
  965.               } // not in comment
  966.               
  967.             }  // if realspace
  968.           }  // if parseall
  969.           
  970.           
  971.           // ------------------------------------------------------------
  972.           // p!=0 : on a repéré un éventuel lien
  973.           // ------------------------------------------------------------
  974.           //
  975.           if ((p>0) || (valid_p)) {    // on a repΘrΘ un lien
  976.             //int lien_valide=0;
  977.             char* eadr=NULL;          /* fin de l'URL */
  978.             char* quote_adr=NULL;     /* adresse du ? dans l'adresse */
  979.             int ok=1;
  980.             char quote='\0';
  981.  
  982.             // TEST
  983.             /*{
  984.               static int loop=0;
  985.               if ((++loop)%5000==0)
  986.                 loop=0;
  987.             }*/
  988.             
  989.             // si nofollow ou un stop a été déclenché, réécrire tous les liens en externe
  990.             if ((nofollow) || (opt.state.stop))
  991.               p_nocatch=1;
  992.  
  993.             // Θcrire codebase avant, flusher avant code
  994.             if ((p_type==-1) || (p_type==-2)) {
  995.               if ((opt.getmode & 1) && (ptr>0)) {
  996.                 HT_ADD_ADR;    // refresh
  997.               }
  998.               lastsaved=adr;    // dernier Θcrit+1
  999.             }
  1000.             
  1001.             // sauter espaces
  1002.             adr+=p;
  1003.             while((is_space(*adr)) && (quote=='\0')) {
  1004.               if (!quote)
  1005.                 if ((*adr=='\"') || (*adr=='\''))
  1006.                   quote=*adr;                     // on doit attendre cela α la fin
  1007.                                                   // puis quitter
  1008.                 adr++;    // sauter les espaces, "" et cie
  1009.             }
  1010.  
  1011.             /* Stop at \n (LF) if primary links*/
  1012.             if (ptr == 0)
  1013.               quote='\n';
  1014.             /* s'arrΩter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */
  1015.             else if (inscript)
  1016.               quote='\0';
  1017.             
  1018.             // sauter Θventuel \" ou \' javascript
  1019.             if (inscript) {    // on est dans un obj.write("..
  1020.               if (*adr=='\\') {
  1021.                 if ((*(adr+1)=='\'') || (*(adr+1)=='"')) {  // \" ou \'
  1022.                   adr+=2;    // sauter
  1023.                 }
  1024.               }
  1025.             }
  1026.             
  1027.             // sauter content="1;URL=http://..
  1028.             if (p_searchMETAURL) {
  1029.               int l=0;
  1030.               while(!strfield(adr+l,"URL=") && (l<128) ) l++;
  1031.               if (!strfield(adr,"URL="))
  1032.                 ok=-1;
  1033.               else
  1034.                 adr+=(l+4);
  1035.             }
  1036.  
  1037.             /* éviter les javascript:document.location=.. : les parser, plutôt */
  1038.             if (ok!=-1) {
  1039.               if (strfield(adr,"javascript:")) {
  1040.                 ok=-1;
  1041.                 /*
  1042.                 On est dΘsormais dans du code javascript
  1043.                 */
  1044.                 inscript_tag=inscript=1;
  1045.                 inscript_tag_lastc=quote;     /* α attendre α la fin */
  1046.               }
  1047.             }
  1048.             
  1049.             if (p_type==1) {
  1050.               if (*adr=='#') {
  1051.                 adr++;           // sauter # pour usemap etc
  1052.               }
  1053.             }
  1054.             eadr=adr;
  1055.             
  1056.             // ne pas flusher après code si on doit écrire le codebase avant!
  1057.             if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) {
  1058.               if ((opt.getmode & 1) && (ptr>0)) {
  1059.                 HT_ADD_ADR;    // refresh
  1060.               }
  1061.               lastsaved=adr;    // dernier Θcrit+1
  1062.               // après on écrira soit les données initiales,
  1063.               // soit une URL/lien modifié!
  1064.             } else if (p_type==-1) p_flush=adr;    // flusher jusqu'α adr ensuite
  1065.             
  1066.             if (ok!=-1) {    // continuer
  1067.               // dΘcouper le lien
  1068.               do {
  1069.                 if ((* (unsigned char*) eadr)<32) {   // caractΦre de contr⌠le (ou \0)
  1070.                   if (!is_space(*eadr))
  1071.                     ok=0; 
  1072.                 }
  1073.                 if ( ( ((int) eadr) - ((int) adr) ) > HTS_URLMAXSIZE)  // ** trop long, >HTS_URLMAXSIZE caractΦres (on prΘvoit HTS_URLMAXSIZE autres pour path)
  1074.                   ok=-1;    // ne pas traiter ce lien
  1075.                 
  1076.                 if (ok) {
  1077.                   //if (*eadr!=' ') {  
  1078.                   if (is_space(*eadr)) {   // guillemets,CR, etc
  1079.                     if ((!quote) || (*eadr==quote))     // si pas d'attente de quote spΘciale ou si quote atteinte
  1080.                       ok=0; 
  1081.                   } else if (ending_p && (*eadr==ending_p))
  1082.                     ok=0;
  1083.                   else {
  1084.                     switch(*eadr) {
  1085.                     case '>': 
  1086.                       if (!quote) {
  1087.                         if (!inscript) {
  1088.                           intag=0;    // PLUS dans un tag!
  1089.                           intag_start_valid=0;
  1090.                         }
  1091.                         ok=0;
  1092.                       }
  1093.                       break;
  1094.                       /*case '<':*/ case '#': ok=0; break;    // case '?': non!
  1095.                     case '\\': if (inscript) ok=0; break;     // \" ou \' point d'arrΩt
  1096.                     case '?': quote_adr=adr; break;           // noter position query
  1097.                     }
  1098.                   }
  1099.                   //}
  1100.                 } 
  1101.                 eadr++;
  1102.               } while(ok==1);     
  1103.               
  1104.               // Empty link detected
  1105.               if ( (((int) eadr)-((int) adr)) <= 1) {       // link empty
  1106.                 ok=-1;        // No
  1107.                 if (*adr != '#') {        // Not empty+unique #
  1108.                   if ( (((int) eadr)-((int) adr)) == 1) {       // 1=link empty with delim (end_adr-start_adr)
  1109.                     if (quote) {
  1110.                       HT_ADD("#");        // We add this for a <href="">
  1111.                     }
  1112.                   }
  1113.                 }
  1114.               }
  1115.               
  1116.             }
  1117.             
  1118.             if (ok==0) {    // tester un lien
  1119.               char lien[HTS_URLMAXSIZE*2];
  1120.               int meme_adresse=0;      // 0 par dΘfaut pour primary
  1121.               //char *copie_de_adr=adr;
  1122.               //char* p;
  1123.               
  1124.               // construire lien (dΘcoupage)
  1125.               if ( (((int) eadr)-((int) adr)-1) < HTS_URLMAXSIZE  ) {    // pas trop long?
  1126.                 strncpy(lien,adr,((int) eadr)-((int) adr)-1);
  1127.                 *(lien+  (((int) eadr)-((int) adr))-1  )='\0';
  1128.                 //printf("link: %s\n",lien);          
  1129.                 // supprimer les espaces
  1130.                 while((lien[strlen(lien)-1]==' ') && (strnotempty(lien))) lien[strlen(lien)-1]='\0';
  1131.  
  1132.                 
  1133.                 // supprimer les // en / (sauf pour http://)
  1134.                 {
  1135.                   char *a,*p,*q;
  1136.                   int done=0;
  1137.                   a=strchr(lien,':');    // http://
  1138.                   if (a) {
  1139.                     a++;
  1140.                     while(*a=='/') a++;    // position aprΦs http://
  1141.                   } else {
  1142.                     a=lien;                // dΘbut
  1143.                     while(*a=='/') a++;    // position aprΦs http://
  1144.                   }
  1145.                   q=strchr(a,'?');     // ne pas traiter aprΦs '?'
  1146.                   if (!q)
  1147.                     q=a+strlen(a)-1;
  1148.                   while(( p=strstr(a,"//")) && (!done) ) {    // remplacer // par /
  1149.                     if ((int) p>(int) q) {   // aprΦs le ? (toto.cgi?param=1//2.3)
  1150.                       done=1;    // stopper
  1151.                     } else {
  1152.                       char tempo[HTS_URLMAXSIZE*2];
  1153.                       tempo[0]='\0';
  1154.                       strncat(tempo,a,(int) p - (int) a);
  1155.                       strcat (tempo,p+1);
  1156.                       strcpy(a,tempo);    // recopier
  1157.                     }
  1158.                   }
  1159.                 }
  1160.               } else
  1161.                 lien[0]='\0';    // erreur
  1162.               
  1163.               // ------------------------------------------------------
  1164.               // Lien repΘrΘ et extrait
  1165.               if (strnotempty(lien)>0) {           // construction du lien
  1166.                 char adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2];          // ATTENTION adr cache le "vrai" adr
  1167.                 int forbidden_url=-1;              // lien non interdit (mais non autorisΘ..)
  1168.                 int just_test_it=0;                // mode de test des liens
  1169.                 int set_prio_to=0;                 // pour capture de page isolΘe
  1170.                 int import_done=0;                 // lien importΘ (ne pas scanner ensuite *α priori*)
  1171.                 //
  1172.                 adr[0]='\0'; fil[0]='\0';
  1173.                 //
  1174.                 // 0: autorisé
  1175.                 // 1: interdit (patcher tout de même adresse)
  1176.                 
  1177.                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1178.                   fspc(opt.log,"debug"); fprintf(opt.log,"link detected in html: %s"LF,lien); test_flush;
  1179.                 }
  1180.  
  1181.                 // external check
  1182. #if HTS_ANALYSTE
  1183.                 if (!hts_htmlcheck_linkdetected(lien)) {
  1184.                   error=1;    // erreur
  1185.                   if (opt.errlog) {
  1186.                     fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s refused by external wrapper"LF,lien);
  1187.                     test_flush;
  1188.                   }
  1189.                 }
  1190. #endif
  1191.                 
  1192.                 // purger espaces de début, CR,LF résiduels (IMG SRC="foo.<\n>gif")
  1193.                 {
  1194.                   char* a;
  1195.                   while (is_realspace(lien[0])) {
  1196.                     char tempo[HTS_URLMAXSIZE*2];
  1197.                     tempo[0]='\0';
  1198.                     strcpy(tempo,lien+1);
  1199.                     strcpy(lien,tempo);
  1200.                   }
  1201.                   while ((a=strchr(lien,'\n'))) {
  1202.                     char tempo[HTS_URLMAXSIZE*2];
  1203.                     tempo[0]='\0';
  1204.                     strncat(tempo,lien,(int) a - (int) lien);
  1205.                     strcat(tempo,a+1);
  1206.                     strcpy(lien,tempo);
  1207.                   }
  1208.                   while ((a=strchr(lien,'\r'))) {
  1209.                     char tempo[HTS_URLMAXSIZE*2];
  1210.                     tempo[0]='\0';
  1211.                     strncat(tempo,lien,(int) a - (int) lien);
  1212.                     strcat(tempo,a+1);
  1213.                     strcpy(lien,tempo);
  1214.                   }
  1215.                 }
  1216.                 
  1217.                 /* Unescape/escape %20 and other   */
  1218.                 {
  1219.                   char query[HTS_URLMAXSIZE*2];
  1220.                   char* a=strchr(lien,'?');
  1221.                   if (a) {
  1222.                     strcpy(query,a);
  1223.                     *a='\0';
  1224.                   } else
  1225.                     query[0]='\0';
  1226.                   // conversion &amp; -> & et autres joyeusetés
  1227.                   unescape_amp(lien);
  1228.                   unescape_amp(query);
  1229.                   // décoder l'inutile (%2E par exemple) et coder espaces
  1230.                   // XXXXXXXXXXXXXXXXX strcpy(lien,unescape_http(lien));
  1231.                   strcpy(lien,unescape_http_unharm(lien));
  1232.                   escape_spc_url(lien);
  1233.                   strcat(lien,query);     /* restore */
  1234.                 }
  1235.                 
  1236.                 // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!
  1237.                 {
  1238.                   char* a=jump_identification(lien);
  1239.                   while( (a=strchr(a,'\\')) ) *a='/';
  1240.                 }
  1241.                 
  1242.                 // supprimer le(s) ./
  1243.                 while ((lien[0]=='.') && (lien[1]=='/')) {
  1244.                   char tempo[HTS_URLMAXSIZE*2];
  1245.                   strcpy(tempo,lien+2);
  1246.                   strcpy(lien,tempo);
  1247.                 }
  1248.                 if (strnotempty(lien)==0)  // sauf si plus de nom de fichier
  1249.                   strcpy(lien,"./");
  1250.                 
  1251.                 // vΘrifie les /~machin -> /~machin/
  1252.                 // supposition dangereuse?
  1253.                 if (lien[strlen(lien)-1]!='/') {
  1254.                   char *a=lien+strlen(lien)-1;
  1255.                   // Θviter aussi index~1.html
  1256.                   while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--;
  1257.                   if (*a=='~') {
  1258.                     strcat(lien,"/");    // ajouter slash
  1259.                   }
  1260.                 }
  1261.                 
  1262.                 // APPLET CODE="mixer.MixerApplet.class" --> APPLET CODE="mixer/MixerApplet.class"
  1263.                 // yes, this is dirty
  1264.                 // but I'm so lazzy..
  1265.                 // and besides the java "code" convention is really a pain in html code
  1266.                 if (p_type==-1) {
  1267.                   char* a=strrchr(lien,'.');
  1268.                   add_class_dots_to_patch=0;
  1269.                   if (a) {
  1270.                     char* b;
  1271.                     do {
  1272.                       b=strchr(lien,'.');
  1273.                       if ((b != a) && (b)) {
  1274.                         add_class_dots_to_patch++;
  1275.                         *b='/';
  1276.                       }
  1277.                     } while((b != a) && (b));
  1278.                   }
  1279.                 }
  1280.                 
  1281.                 // éliminer les éventuels :80 (port par défaut!)
  1282.                 {
  1283.                   char * a;
  1284.                   a=strstr(lien,"//");    // "//" authority
  1285.                   if (a)
  1286.                     a+=2;
  1287.                   else
  1288.                     a=lien;
  1289.                   while((*a) && (*a!='/') && (*a!=':')) a++;
  1290.                   if (*a==':') {  // port
  1291.                     int port=0;
  1292.                     char* b=a+1;
  1293.                     while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; }
  1294.                     if (port==80) {  // port 80, default
  1295.                       char tempo[HTS_URLMAXSIZE*2];
  1296.                       tempo[0]='\0';
  1297.                       strncat(tempo,lien,(int) a-(int) lien);
  1298.                       strcat(tempo,a+3);  // sauter :80
  1299.                       strcpy(lien,tempo);
  1300.                     }
  1301.                   }
  1302.                 }
  1303.                 
  1304.                 // filtrer les parasites (mailto & cie)
  1305.                 /*
  1306.                 if (strfield(lien,"mailto:")) {  // ne pas traiter
  1307.                   error=1;
  1308.                 } else if (strfield(lien,"news:")) {  // ne pas traiter
  1309.                   error=1;
  1310.                 }
  1311.                 */
  1312.                 
  1313.                 // vérifier que l'on ne doit pas ajouter de .class
  1314.                 if (!error) {
  1315.                   if (add_class) {
  1316.                     char *a = lien+strlen(lien)-1;
  1317.                     while(((int) a > (int) lien) && (*a!='/') && (*a!='.')) a--;
  1318.                     if (*a != '.')
  1319.                       strcat(lien,".class");    // ajouter .class
  1320.                     else if (!strfield2(a,".class"))
  1321.                       strcat(lien,".class");    // idem
  1322.                   }
  1323.                 }
  1324.                 
  1325.                 // si c'est un chemin, alors vérifier (toto/toto.html -> http://www/toto/)
  1326.                 if (!error) {
  1327.                   if ((opt.debug>1) && (opt.log!=NULL)) {
  1328.                     fspc(opt.log,"debug"); fprintf(opt.log,"position link check %s"LF,lien); test_flush;
  1329.                   }
  1330.                   
  1331.                   if ((p_type==2) || (p_type==-2)) {   // code ou codebase                        
  1332.                     // Vérifier les codebase=applet (au lieu de applet/)
  1333.                     if (p_type==-2) {    // codebase
  1334.                       if (strnotempty(lien)) {
  1335.                         if (fil[strlen(lien)-1]!='/') {  // pas rΘpertoire
  1336.                           strcat(lien,"/");
  1337.                         }
  1338.                       }
  1339.                     }
  1340.                     /* only one ending / (bug on some pages) */
  1341.                     if ((int)strlen(lien)>2) {
  1342.                       while( (lien[strlen(lien)-2]=='/') && ((int)strlen(lien)>2) )    /* double // (bug) */
  1343.                         lien[strlen(lien)-1]='\0';
  1344.                     }
  1345.                     // copier nom host si besoin est
  1346.                     if (strstr(lien,"//")==NULL) {  // pas de http://
  1347.                       char adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2];  // ** euh ident_url_relatif??
  1348.                       if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) {                        
  1349.                         error=1;
  1350.                       } else {
  1351.                         strcpy(lien,"http://");
  1352.                         strcat(lien,adr2);
  1353.                         if (*fil2!='/')
  1354.                           strcat(lien,"/");
  1355.                         strcat(lien,fil2);
  1356.                         {
  1357.                           char* a;
  1358.                           a=lien+strlen(lien)-1;
  1359.                           while((*a) && (*a!='/') && ((int) a> (int) lien)) a--;
  1360.                           if (*a=='/') {
  1361.                             *(a+1)='\0';
  1362.                           }
  1363.                         }
  1364.                         //char tempo[HTS_URLMAXSIZE*2];
  1365.                         //strcpy(tempo,"http://");
  1366.                         //strcat(tempo,urladr);    // host
  1367.                         //if (*lien!='/')
  1368.                         //  strcat(tempo,"/");
  1369.                         //strcat(tempo,lien);
  1370.                         //strcpy(lien,tempo);
  1371.                       }
  1372.                     }
  1373.                     
  1374.                     if (!error) {  // pas d'erreur?
  1375.                       if (p_type==2) {   // code ET PAS codebase      
  1376.                         char* a=lien+strlen(lien)-1;
  1377.                         while( ((int) a > (int) lien) && (*a) && (*a!='/')) a--;
  1378.                         if (*a=='/')     // ok on a repΘrΘ le dernier /
  1379.                           *(a+1)='\0';   // couper
  1380.                         else {
  1381.                           *lien='\0';    // Θliminer
  1382.                           error=1;   // erreur, ne pas poursuivre
  1383.                         }      
  1384.                       }
  1385.                       
  1386.                       // stocker base ou codebase?
  1387.                       switch(p_type) {
  1388.                       case 2: { 
  1389.                         //if (*lien!='/') strcat(base,"/");
  1390.                         strcpy(base,lien);
  1391.                               }
  1392.                         break;      // base
  1393.                       case -2: {
  1394.                         //if (*lien!='/') strcat(codebase,"/");
  1395.                         strcpy(codebase,lien); 
  1396.                                }
  1397.                         break;  // base
  1398.                       }
  1399.                       
  1400.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1401.                         fspc(opt.log,"debug"); fprintf(opt.log,"code/codebase link %s base %s"LF,lien,base); test_flush;
  1402.                       }
  1403.                       //printf("base code: %s - %s\n",lien,base);
  1404.                     }
  1405.                     
  1406.                   } else {
  1407.                     char* _base;
  1408.                     if (p_type==-1)   // code (applet)
  1409.                       _base=codebase;
  1410.                     else
  1411.                       _base=base;
  1412.  
  1413.                     
  1414.                     // ajouter chemin de base href..
  1415.                     if (strnotempty(_base)) {       // considΘrer base
  1416.                       if (!strstr(lien,"//")) {    // non absolue
  1417.                         if (*lien!='/') {           // non absolu sur le site (/)
  1418.                           if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) {
  1419.                             // mailto: and co: do NOT add base
  1420.                             if (ident_url_relatif(lien,urladr,urlfil,adr,fil)>=0) {
  1421.                               char tempo[HTS_URLMAXSIZE*2];
  1422.                               // base est absolue
  1423.                               strcpy(tempo,_base);
  1424.                               strcat(tempo,lien);
  1425.                               strcpy(lien,tempo);        // patcher en considΘrant base
  1426.                               // ** vΘrifier que ../ fonctionne (ne doit pas arriver mais bon..)
  1427.                               
  1428.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1429.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link modified with code/codebase %s"LF,lien); test_flush;
  1430.                               }
  1431.                             }
  1432.                           } else {
  1433.                             error=1;    // erreur
  1434.                             if (opt.errlog) {
  1435.                               fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s too long with base href"LF,lien);
  1436.                               test_flush;
  1437.                             }
  1438.                           }
  1439.                         }
  1440.                       }
  1441.                     }
  1442.                     
  1443.  
  1444.                   }
  1445.                   }
  1446.                   
  1447.                   
  1448.                   // transformer lien quelconque (http, relatif, etc) en une adresse
  1449.                   // et un chemin+fichier (adr,fil)
  1450.                   if (!error) {
  1451.                     int reponse;
  1452.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1453.                       fspc(opt.log,"debug"); fprintf(opt.log,"build relative link %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1454.                     }
  1455.                     if ((reponse=ident_url_relatif(lien,urladr,urlfil,adr,fil))<0) {                        
  1456.                       adr[0]='\0';    // erreur
  1457.                       if (reponse==-2) {
  1458.                         if (opt.errlog) {
  1459.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s not caught (unknown ftp:// protocol)"LF,lien);
  1460.                           test_flush;
  1461.                         }
  1462.                       } else {
  1463.                         if ((opt.debug>1) && (opt.errlog!=NULL)) {
  1464.                           fspc(opt.errlog,"debug"); fprintf(opt.errlog,"ident_url_relatif failed for %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1465.                         }
  1466.                       }
  1467.                     }
  1468.                   } else {
  1469.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1470.                       fspc(opt.log,"debug"); fprintf(opt.log,"link %s not build, error detected before"LF,lien); test_flush;
  1471.                     }
  1472.                     adr[0]='\0';
  1473.                   }
  1474.                   
  1475. #if HTS_CHECK_STRANGEDIR
  1476.                   // !ATTENTION!
  1477.                   // Ici on teste les exotiques du genre www.truc.fr/machin (sans slash à la fin)
  1478.                   // je n'ai pas encore trouvé le moyen de faire la différence entre un répertoire
  1479.                   // et un fichier en http A PRIORI : je fais donc un test
  1480.                   // En cas de moved xxx, on recalcule adr et fil, tout simplement
  1481.                   // DEFAUT: test effectué plusieurs fois! à revoir!!!
  1482.                   if ((adr[0]!='\0') && (strcmp(adr,"file://") && (p_type!=2) && (p_type!=-2)) {
  1483.                     //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) {
  1484.                     if (fil[strlen(fil)-1]!='/') {  // pas rΘpertoire
  1485.                       if (ishtml(fil)==-2) {    // pas d'extension
  1486.                         char loc[HTS_URLMAXSIZE*2];  // Θventuelle nouvelle position
  1487.                         loc[0]='\0';
  1488.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1489.                           fspc(opt.log,"debug"); fprintf(opt.log,"link-check-directory: %s%s"LF,adr,fil);
  1490.                           test_flush;
  1491.                         }
  1492.                         
  1493.                         // tester Θventuelle nouvelle position
  1494.                         switch (http_location(adr,fil,loc).statuscode) {
  1495.                         case 200: // ok au final
  1496.                           if (strnotempty(loc)) {  // a changΘ d'adresse
  1497.                             if (opt.errlog) {
  1498.                               fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil);
  1499.                               test_flush;
  1500.                             }
  1501.                             
  1502.                             // recalculer adr et fil!
  1503.                             if (ident_url_absolute(loc,adr,fil)==-1) {
  1504.                               adr[0]='\0';  // cancel
  1505.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1506.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link-check-dir: %s%s"LF,adr,fil);
  1507.                                 test_flush;
  1508.                               }
  1509.                             }
  1510.                             
  1511.                           }
  1512.                           break;
  1513.                         case -2: case -3:  // timeout ou erreur grave
  1514.                           if (opt.errlog) {
  1515.                             fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil);
  1516.                             test_flush;
  1517.                           }
  1518.                           
  1519.                           break;
  1520.                         }
  1521.                         
  1522.                       }
  1523.                     } 
  1524.                   }
  1525. #endif
  1526.                   
  1527.                   // Le lien doit juste être réécrit, mais ne doit pas générer un lien
  1528.                   // exemple: <FORM ACTION="url_cgi">
  1529.                   if (p_nocatch) {
  1530.                     forbidden_url=1;    // interdire rΘcupΘration du lien
  1531.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1532.                       fspc(opt.log,"debug"); fprintf(opt.log,"link ignored at %s%s"LF,adr,fil);
  1533.                       test_flush;
  1534.                     }
  1535.                   }
  1536.                   
  1537.                   // Tester si un lien doit être accepté ou refusé (wizard)
  1538.                   // forbidden_url=1 : lien refusé
  1539.                   // forbidden_url=0 : lien accepté
  1540.                   //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1541.                   if ((p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1542.                     if (!p_nocatch) {
  1543.                       if (adr[0]!='\0') {          
  1544.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1545.                           fspc(opt.log,"debug"); fprintf(opt.log,"wizard link test at %s%s.."LF,adr,fil);
  1546.                           test_flush;
  1547.                         }
  1548.                         forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1549.                           adr,fil,
  1550.                           filters,&filptr,filter_max,
  1551.                           &robots,
  1552.                           &set_prio_to,
  1553.                           &just_test_it);
  1554.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1555.                           fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard link test: %d"LF,forbidden_url);
  1556.                           test_flush;
  1557.                         }
  1558.                       }
  1559.                     }
  1560.                   }
  1561.                   
  1562.                   // calculer meme_adresse
  1563.                   meme_adresse=strfield2(jump_identification(adr),jump_identification(urladr));
  1564.                   
  1565.                   
  1566.                   
  1567.                   // Début partie sauvegarde
  1568.                   
  1569.                   // ici on forme le nom du fichier à sauver, et on patche l'URL
  1570.                   if (adr[0]!='\0') {
  1571.                     // savename: simplifier les ../ et autres joyeusetés
  1572.                     char save[HTS_URLMAXSIZE*2];
  1573.                     int r_sv=0;
  1574.                     // En cas de moved, adresse première
  1575.                     char former_adr[HTS_URLMAXSIZE*2];
  1576.                     char former_fil[HTS_URLMAXSIZE*2];
  1577.                     //
  1578.                     save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0';
  1579.                     //
  1580.                     
  1581.                     // nom du chemin à sauver si on doit le calculer
  1582.                     // note: url_savename peut décider de tester le lien s'il le trouve
  1583.                     // suspect, et modifier alors adr et fil
  1584.                     // dans ce cas on aura une référence directe au lieu des traditionnels
  1585.                     // moved en cascade (impossible à reproduire à priori en local, lorsque des fichiers
  1586.                     // gif sont impliqués par exemple)
  1587.                     if ((p_type!=2) && (p_type!=-2)) {  // pas base href ou codebase
  1588.                       if (forbidden_url!=1) {
  1589.                         char last_adr[HTS_URLMAXSIZE*2];
  1590.                         last_adr[0]='\0';
  1591.                         //char last_fil[HTS_URLMAXSIZE*2]="";
  1592.                         strcpy(last_adr,adr);    // ancienne adresse
  1593.                         //strcpy(last_fil,fil);    // ancien chemin
  1594.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,&opt,liens,lien_tot,back,back_max,&cache,&hash,ptr,numero_passe);
  1595.                         if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) {  // a changΘ
  1596.                           
  1597.                           // 2e test si moved
  1598.                           
  1599.                           // Tester si un lien doit être accepté ou refusé (wizard)
  1600.                           // forbidden_url=1 : lien refusé
  1601.                           // forbidden_url=0 : lien accepté
  1602.                           if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1603.                             if (!p_nocatch) {
  1604.                               if (adr[0]!='\0') {          
  1605.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1606.                                   fspc(opt.log,"debug"); fprintf(opt.log,"wizard moved link retest at %s%s.."LF,adr,fil);
  1607.                                   test_flush;
  1608.                                 }
  1609.                                 forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1610.                                   adr,fil,
  1611.                                   filters,&filptr,filter_max,
  1612.                                   &robots,
  1613.                                   &set_prio_to,
  1614.                                   &just_test_it);
  1615.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1616.                                   fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard moved link retest: %d"LF,forbidden_url);
  1617.                                   test_flush;
  1618.                                 }
  1619.                               }
  1620.                             }
  1621.                           }
  1622.                           
  1623.                           //import_done=1;    // c'est un import!
  1624.                           meme_adresse=0;   // on a changΘ
  1625.                         }
  1626.                       } else {
  1627.                         strcpy(save,"");  // dummy
  1628.                       }
  1629.                     }
  1630.                     if (r_sv!=-1) {  // pas d'erreur, on continue
  1631.                       /* log */
  1632.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1633.                         fspc(opt.log,"debug");
  1634.                         if (forbidden_url!=1) {    // le lien va Ωtre chargΘ
  1635.                           if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, pas un lien
  1636.                             fprintf(opt.log,"Code/Codebase: %s%s"LF,adr,fil);
  1637.                           } else if ((opt.getmode & 4)==0) {
  1638.                             fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1639.                           } else {
  1640.                             if (!ishtml(fil))
  1641.                               fprintf(opt.log,"Record after: %s%s -> %s"LF,adr,fil,save);
  1642.                             else
  1643.                               fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1644.                           } 
  1645.                         } else
  1646.                           fprintf(opt.log,"External: %s%s"LF,adr,fil);
  1647.                         test_flush;
  1648.                       }
  1649.                       /* FIN log */
  1650.                       
  1651.                       // écrire lien
  1652.                       if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, sauter
  1653.                         lastsaved=eadr-1+1;  // sauter "
  1654.                       } else if (forbidden_url==1) {    // le lien ne sera pas chargΘ, rΘfΘrence externe!
  1655.                         if ((opt.getmode & 1) && (ptr>0)) {
  1656.                           if (p_type!=-1) {     // pas que le nom de fichier (pas classe java)
  1657.                             if (!opt.external) {
  1658.                               if (!strstr(adr,"//")) {
  1659.                                 HT_ADD("http://");
  1660.                               }
  1661.                               if (!opt.passprivacy) {
  1662.                                 HT_ADD(adr);     // Password
  1663.                               } else {
  1664.                                 HT_ADD(jump_identification(adr));     // No Password
  1665.                               }
  1666.                               if (*fil!='/')
  1667.                                 HT_ADD("/");
  1668.                               HT_ADD(fil);
  1669.                               //
  1670.                             } else {    // fichier/page externe, mais on veut gΘnΘrer une erreur
  1671.                               //
  1672.                               int patch_it=0;
  1673.                               int add_url=0;
  1674.                               char* cat_name=NULL;
  1675.                               char* cat_data=NULL;
  1676.                               int cat_nb=0;
  1677.                               int cat_data_len=0;
  1678.                               
  1679.                               // ajouter lien external
  1680.                               switch ((fil[strlen(fil)-1]=='/')?1:(ishtml(fil))) {
  1681.                               case 1: case -2:       // html ou rΘpertoire
  1682.                                 if (opt.getmode & 1) {  // sauver html
  1683.                                   patch_it=1;   // redirect
  1684.                                   add_url=1;    // avec link?
  1685.                                   cat_name="external.html";
  1686.                                   cat_nb=0;
  1687.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1688.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1689.                                 }
  1690.                                 break;
  1691.                               default:    // inconnu
  1692.                                 // asp, cgi..
  1693.                                 if (is_dyntype(get_ext(fil))) {
  1694.                                   patch_it=1;   // redirect
  1695.                                   add_url=1;    // avec link?
  1696.                                   cat_name="external.html";
  1697.                                   cat_nb=0;
  1698.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1699.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1700.                                 } else if ( (strfield2(fil+max(0,(int)strlen(fil)-4),".gif")) 
  1701.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".jpg")) 
  1702.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".xbm")) 
  1703.                                   || (ishtml(fil)!=0) ) {
  1704.                                   patch_it=1;   // redirect
  1705.                                   add_url=1;    // avec link aussi
  1706.                                   cat_name="external.gif";
  1707.                                   cat_nb=1;
  1708.                                   cat_data=HTS_DATA_UNKNOWN_GIF;
  1709.                                   cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN;
  1710.                                 }
  1711.                                 break;
  1712.                               }// html,gif
  1713.                               
  1714.                               if (patch_it) {
  1715.                                 char save[HTS_URLMAXSIZE*2];
  1716.                                 char tempo[HTS_URLMAXSIZE*2];
  1717.                                 strcpy(save,opt.path_html);
  1718.                                 strcat(save,cat_name);
  1719.                                 if (lienrelatif(tempo,save,savename)==0) {
  1720.                                   escape_uri(tempo);     // escape with %xx
  1721.                                   HT_ADD(tempo);    // page externe
  1722.                                   if (add_url) {
  1723.                                     HT_ADD("?link=");    // page externe
  1724.                                     if (!opt.passprivacy) {
  1725.                                       HT_ADD(adr);   // Password
  1726.                                     } else {
  1727.                                       HT_ADD(jump_identification(adr));   // No Password
  1728.                                     }
  1729.                                     if (*fil!='/')
  1730.                                       HT_ADD("/");
  1731.                                     HT_ADD(fil);
  1732.                                   }
  1733.                                 }
  1734.                                 
  1735.                                 // écrire fichier?
  1736.                                 if (verif_external(cat_nb,1)) {
  1737.                                 //if (!fexist(fconcat(opt.path_html,cat_name))) {
  1738.                                   FILE* fp = filecreate(fconcat(opt.path_html,cat_name));
  1739.                                   if (fp) {
  1740.                                     if (cat_data_len==0) {   // texte
  1741.                                       verif_backblue(opt.path_html);
  1742.                                       fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data);
  1743.                                     } else {                    // data
  1744.                                       fwrite(cat_data,cat_data_len,1,fp);
  1745.                                     }
  1746.                                     fclose(fp);
  1747.                                     usercommand(0,NULL,fconcat(opt.path_html,cat_name));
  1748.                                   }
  1749.                                 }
  1750.                               }  else {    // Θcrire normalement le nom de fichier
  1751.                                 HT_ADD("http://");
  1752.                                 if (!opt.passprivacy) {
  1753.                                   HT_ADD(adr);       // Password
  1754.                                 } else {
  1755.                                   HT_ADD(jump_identification(adr));       // No Password
  1756.                                 }
  1757.                                 if (*fil!='/')
  1758.                                   HT_ADD("/");
  1759.                                 HT_ADD(fil);
  1760.                               }// patcher?
  1761.                             }  // external
  1762.                           } else {  // que le nom de fichier (classe java)
  1763.                             // en gros recopie de plus bas: copier codebase et base
  1764.                             if (p_flush) {
  1765.                               char tempo[HTS_URLMAXSIZE*2];    // <-- ajoutΘ
  1766.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1767.  
  1768.                               // Calculer chemin
  1769.                               tempo_pat[0]='\0';
  1770.                               strcpy(tempo,fil);  // <-- ajoutΘ
  1771.                               {
  1772.                                 char* a=strrchr(tempo,'/');
  1773.  
  1774.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1775.                                 // we have to do the contrary now
  1776.                                 if (add_class_dots_to_patch>0) {
  1777.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1778.                                     *a='.';     // convert "false" java / into .
  1779.                                     add_class_dots_to_patch--;
  1780.                                     a=strrchr(tempo,'/');
  1781.                                   }
  1782.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1783.                                   if (add_class_dots_to_patch) {
  1784.                                     if (opt.errlog) {
  1785.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1786.                                       test_flush;
  1787.                                     }
  1788.                                   }
  1789.                                 }
  1790.  
  1791.                                 // Cut path/filename
  1792.                                 if (a) {
  1793.                                   char tempo2[HTS_URLMAXSIZE*2];
  1794.                                   strcpy(tempo2,a+1);         // FICHIER
  1795.                                   strncat(tempo_pat,tempo,(int) a-(int) tempo+1);  // chemin
  1796.                                   strcpy(tempo,tempo2);                     // fichier
  1797.                                 }
  1798.                               }
  1799.                               
  1800.                               // écrire codebase="chemin"
  1801.                               if ((opt.getmode & 1) && (ptr>0)) {
  1802.                                 char tempo4[HTS_URLMAXSIZE*2];
  1803.                                 tempo4[0]='\0';
  1804.                                 
  1805.                                 if (strnotempty(tempo_pat)) {
  1806.                                   HT_ADD("codebase=\"http://");
  1807.                                   if (!opt.passprivacy) {
  1808.                                     HT_ADD(adr);  // Password
  1809.                                   } else {
  1810.                                     HT_ADD(jump_identification(adr));  // No Password
  1811.                                   }
  1812.                                   if (*tempo_pat!='/') HT_ADD("/");
  1813.                                   HT_ADD(tempo_pat);
  1814.                                   HT_ADD("\" ");
  1815.                                 }
  1816.                                 
  1817.                                 strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved);
  1818.                                 HT_ADD(tempo4);    // refresh code="
  1819.                                 HT_ADD(tempo);
  1820.                               }
  1821.                             }
  1822.                           }
  1823.                         }
  1824.                         lastsaved=eadr-1;
  1825.                       } 
  1826.                       /*
  1827.                       else if (opt.urlmode==1) {    // ABSOLU, c'est le cas le moins courant
  1828.                       //  NE FONCTIONNE PAS!!  (et est inutile)
  1829.                       if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1830.                       // Θcrire le lien modifiΘ, absolu
  1831.                       HT_ADD("file:");
  1832.                       if (*save=='/')
  1833.                       HT_ADD(save+1)
  1834.                       else
  1835.                       HT_ADD(save)
  1836.                       }
  1837.                       lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1838.                       }
  1839.                       */
  1840.                       else if (opt.urlmode==3) {    // URI absolue /
  1841.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1842.                           HT_ADD(fil);
  1843.                         }
  1844.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1845.                       }
  1846.                       else if (opt.urlmode==2) {  // RELATIF
  1847.                         char tempo[HTS_URLMAXSIZE*2];
  1848.                         tempo[0]='\0';
  1849.                         // calculer le lien relatif
  1850.                         
  1851.                         if (lienrelatif(tempo,save,savename)==0) {
  1852.                           escape_uri(tempo);     // escape with %xx
  1853.                           if ((opt.debug>1) && (opt.log!=NULL)) {
  1854.                             fspc(opt.log,"debug"); fprintf(opt.log,"relative link at %s build with %s and %s: %s"LF,adr,save,savename,tempo);
  1855.                             test_flush;
  1856.                           }
  1857.                           
  1858.                           // lien applet (code) - il faut placer un codebase avant
  1859.                           if (p_type==-1) {  // que le nom de fichier
  1860.                             
  1861.                             if (p_flush) {
  1862.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1863.                               tempo_pat[0]='\0';
  1864.                               {
  1865.                                 char* a=strrchr(tempo,'/');
  1866.  
  1867.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1868.                                 // we have to do the contrary now
  1869.                                 if (add_class_dots_to_patch>0) {
  1870.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1871.                                     *a='.';     // convert "false" java / into .
  1872.                                     add_class_dots_to_patch--;
  1873.                                     a=strrchr(tempo,'/');
  1874.                                   }
  1875.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1876.                                   if (add_class_dots_to_patch) {
  1877.                                     if (opt.errlog) {
  1878.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1879.                                       test_flush;
  1880.                                     }
  1881.                                   }
  1882.                                 }
  1883.  
  1884.                                 if (a) {
  1885.                                   char tempo2[HTS_URLMAXSIZE*2];
  1886.                                   strcpy(tempo2,a+1);
  1887.                                   strncat(tempo_pat,tempo,(int) a-(int) tempo+1);  // chemin
  1888.                                   strcpy(tempo,tempo2);                     // fichier
  1889.                                 }
  1890.                               }
  1891.                               
  1892.                               // écrire codebase="chemin"
  1893.                               if ((opt.getmode & 1) && (ptr>0)) {
  1894.                                 char tempo4[HTS_URLMAXSIZE*2];
  1895.                                 tempo4[0]='\0';
  1896.                                 
  1897.                                 if (strnotempty(tempo_pat)) {
  1898.                                   HT_ADD("codebase=\"");
  1899.                                   HT_ADD(tempo_pat);
  1900.                                   HT_ADD("\" ");
  1901.                                 }
  1902.                                 
  1903.                                 strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved);
  1904.                                 HT_ADD(tempo4);    // refresh code="
  1905.                               }
  1906.                             }
  1907.                             //lastsaved=adr;    // dernier Θcrit+1
  1908.                           }                              
  1909.                           
  1910.                           if ((opt.getmode & 1) && (ptr>0)) {
  1911.                             // écrire le lien modifié, relatif
  1912.                             HT_ADD(tempo);
  1913.  
  1914.                             // Add query-string, for informational purpose only
  1915.                             // Useless, because all parameters-pages are saved into different targets
  1916.                             if (opt.includequery) {
  1917.                               char* a=strchr(lien,'?');
  1918.                               if (a) {
  1919.                                 HT_ADD(a);
  1920.                               }
  1921.                             }
  1922.                           }
  1923.                           lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1924.                         } else {
  1925.                           if (opt.errlog) {
  1926.                             fprintf(opt.errlog,"Error building relative link %s and %s"LF,save,savename);
  1927.                             test_flush;
  1928.                           }
  1929.                         }
  1930.                       }  // sinon le lien sera Θcrit normalement
  1931.                       
  1932.                       
  1933. #if 0
  1934.                       if (fexist(save)) {    // le fichier existe..
  1935.                         adr[0]='\0';
  1936.                         //if ((opt.debug>0) && (opt.log!=NULL)) {
  1937.                         if (opt.errlog) {
  1938.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link has already been written on disk, cancelled: %s"LF,save);
  1939.                           test_flush;
  1940.                         }
  1941.                       }
  1942. #endif                            
  1943.                       
  1944.                       if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && ( (forbidden_url!=1) || (just_test_it))) {  // si le fichier n'existe pas, ajouter α la liste                            
  1945.                         // n'y a-t-il pas trop de liens?
  1946.                         if (lien_tot+1 >= lien_max-4) {    // trop de liens!
  1947.                           printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__);
  1948.                           if (opt.errlog) {
  1949.                             fprintf(opt.errlog,LF"Too many URLs, giving up..(>%d)"LF,lien_max);
  1950.                             fprintf(opt.errlog,"To avoid that: use #L option for more links (example: -#L1000000)"LF);
  1951.                             test_flush;
  1952.                           }
  1953.                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  1954.                           XH_uninit;   // dΘsallocation mΘmoire & buffers
  1955.                           return 0;
  1956.                           
  1957.                         } else {    // noter le lien sur la listes des liens α charger
  1958.                           int pass_fix,dejafait=0;
  1959.                           
  1960.                           // Calculer la priorité de ce lien
  1961.                           if ((opt.getmode & 4)==0) {    // traiter html aprΦs
  1962.                             pass_fix=0;
  1963.                           } else {    // vΘrifier que ce n'est pas un !html
  1964.                             if (!ishtml(fil))
  1965.                               pass_fix=1;        // prioritΘ infΘrieure (traiter aprΦs)
  1966.                             else
  1967.                               pass_fix=max(0,numero_passe);    // prioritΘ normale
  1968.                           }
  1969.                           
  1970.                           // vérifier que le lien n'a pas déjà été noté
  1971.                           // si c'est le cas, alors il faut s'assurer que la priorité associée
  1972.                           // au fichier est la plus grande des deux priorités
  1973.                           //
  1974.                           // On part de la fin et on essaye de se presser (économise temps machine)
  1975. #if HTS_HASH
  1976.                           {
  1977.                             int i=hash_read(&hash,save,"",0);      // lecture type 0 (sav)
  1978.                             if (i>=0) {
  1979.                               liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1);
  1980.                               dejafait=1;
  1981.                             }
  1982.                           }
  1983. #else
  1984.                           {
  1985.                             register int l;
  1986.                             register int i;
  1987.                             l=strlen(save);  // opti
  1988.                             for(i=lien_tot-1;(i>=0) && (dejafait==0);i--) {
  1989.                               if (liens[i]->sav_len==l) {    // mΩme taille de chaεne
  1990.                                 if (strcmp(liens[i]->sav,save)==0) {    // existe dΘja
  1991.                                   liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1);
  1992.                                   dejafait=1;
  1993.                                 }
  1994.                               }
  1995.                             }
  1996.                           }
  1997. #endif
  1998.                           
  1999.                           // le lien n'a jamais été créé.
  2000.                           // cette fois ci, on le crée!
  2001.                           if (!dejafait) {                                
  2002.                             //
  2003.                             // >>>> CREER LE LIEN <<<<
  2004.                             //
  2005.                             // enregistrer lien à charger
  2006.                             //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0';
  2007.                             // même adresse: l'objet père est l'objet père de l'actuel
  2008.                             
  2009.                             // DEBUT ROBOTS.TXT AJOUT
  2010.                             if (!just_test_it) {
  2011.                               if (
  2012.                                 (!strfield(adr,"ftp://"))         // non ftp
  2013.                              && (!strfield(adr,"file://")) ) {    // non file
  2014.                                 if (opt.robots) {    // rΘcupΘrer robots
  2015.                                   if (ishtml(fil)!=0) {                       // pas la peine pour des fichiers isolΘs
  2016.                                     if (checkrobots(&robots,adr,"") != -1) {    // robots.txt ?
  2017.                                       checkrobots_set(&robots,adr,"");          // ajouter entrΘe vide
  2018.                                       if (checkrobots(&robots,adr,"") == -1) {    // robots.txt ?
  2019.                                         // enregistrer robots.txt (MACRO)
  2020.                                         liens_record(adr,"/robots.txt","","","");
  2021.                                         if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2022.                                           printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2023.                                           if (opt.errlog) { 
  2024.                                             fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url));
  2025.                                             test_flush;
  2026.                                           }
  2027.                                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2028.                                           XH_uninit;    // dΘsallocation mΘmoire & buffers
  2029.                                           return 0;
  2030.                                         }  
  2031.                                         liens[lien_tot]->testmode=0;          // pas mode test
  2032.                                         liens[lien_tot]->link_import=0;       // pas mode import     
  2033.                                         liens[lien_tot]->premier=lien_tot;
  2034.                                         liens[lien_tot]->precedent=ptr;
  2035.                                         liens[lien_tot]->depth=0;
  2036.                                         liens[lien_tot]->pass2=max(0,numero_passe);
  2037.                                         liens[lien_tot]->retry=0;
  2038.                                         lien_tot++;  // UN LIEN DE PLUS
  2039. #if DEBUG_ROBOTS
  2040.                                         printf("robots.txt: added file robots.txt for %s\n",adr);
  2041. #endif
  2042.                                         if ((opt.debug>1) && (opt.log!=NULL)) {
  2043.                                           fspc(opt.log,"debug"); fprintf(opt.log,"robots.txt added at %s"LF,adr);
  2044.                                           test_flush;
  2045.                                         }
  2046.                                       } else {
  2047.                                         if (opt.errlog) {   
  2048.                                           fprintf(opt.errlog,"Unexpected robots.txt error at %d"LF,__LINE__);
  2049.                                           test_flush;
  2050.                                         }
  2051.                                       }
  2052.                                     }
  2053.                                   }
  2054.                                 }
  2055.                               }
  2056.                             }
  2057.                             // FIN ROBOTS.TXT AJOUT
  2058.                             
  2059.                             // enregistrer (MACRO)
  2060.                             liens_record(adr,fil,save,former_adr,former_fil);
  2061.                             if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2062.                               printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2063.                               if (opt.errlog) { 
  2064.                                 fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url));
  2065.                                 test_flush;
  2066.                               }
  2067.                               if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2068.                               XH_uninit;    // dΘsallocation mΘmoire & buffers
  2069.                               return 0;
  2070.                             }  
  2071.                             
  2072.                             // mode test?
  2073.                             if (!just_test_it)
  2074.                               liens[lien_tot]->testmode=0;          // pas mode test
  2075.                             else
  2076.                               liens[lien_tot]->testmode=1;          // mode test
  2077.                             if (!import_done)
  2078.                               liens[lien_tot]->link_import=0;       // pas mode import
  2079.                             else
  2080.                               liens[lien_tot]->link_import=1;       // mode import
  2081.                             // écrire autres paramètres de la structure-lien
  2082.                             if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0))
  2083.                               liens[lien_tot]->premier=liens[ptr]->premier;
  2084.                             else    // sinon l'objet pΦre est le prΘcΘdent lui mΩme
  2085.                               liens[lien_tot]->premier=lien_tot;
  2086.                             // liens[lien_tot]->premier=ptr;
  2087.                             
  2088.                             liens[lien_tot]->precedent=ptr;
  2089.                             // noter la priorité
  2090.                             if (!set_prio_to)
  2091.                               liens[lien_tot]->depth=liens[ptr]->depth-1;
  2092.                             else
  2093.                               liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1));         // PRIORITE NULLE (catch page)
  2094.                             // noter pass
  2095.                             liens[lien_tot]->pass2=pass_fix;
  2096.                             liens[lien_tot]->retry=opt.retry;
  2097.                             
  2098.                             //strcpy(liens[lien_tot]->adr,adr);
  2099.                             //strcpy(liens[lien_tot]->fil,fil);
  2100.                             //strcpy(liens[lien_tot]->sav,save); 
  2101.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2102.                               if (!just_test_it) {
  2103.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav);
  2104.                               } else {
  2105.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil);
  2106.                               }
  2107.                               test_flush;
  2108.                             }
  2109.                             
  2110.                             lien_tot++;  // UN LIEN DE PLUS
  2111.                           } else { // if !dejafait
  2112.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2113.                               fspc(opt.log,"debug"); fprintf(opt.log,"link has already been recorded, cancelled: %s"LF,save);
  2114.                               test_flush;
  2115.                             }
  2116.                             
  2117.                           }
  2118.                           
  2119.                           
  2120.                         }   // si pas trop de liens
  2121.                       }   // si adr[0]!='\0'
  2122.                       
  2123.                       
  2124.                     }  // if adr[0]!='\0' 
  2125.                     
  2126.                   }  // if adr[0]!='\0'
  2127.                   
  2128.                 }    // if strlen(lien)>0
  2129.                 
  2130.               }   // if ok==0      
  2131.               
  2132.               adr=eadr-1;  // ** sauter
  2133.               
  2134.             }  // if (p) 
  2135.             
  2136.           }  // si '<' ou '>'
  2137.           
  2138.           // plus loin
  2139.           adr++;
  2140.  
  2141.  
  2142.           /* Optimization: if we are scanning in HTML data (not in tag or script), 
  2143.           then jump to the next starting tag */
  2144.           if (ptr>0) {
  2145.             if ( (!intag)         /* Not in tag */
  2146.               && (!inscript)      /* Not in (java)script */
  2147.               && (!incomment)     /* Not in comment (<!--) */
  2148.               && (!inscript_tag)  /* Not in tag with script inside */
  2149.               ) 
  2150.             {
  2151.               /* Not at the end */
  2152.               if (( ((int) adr) - ((int) r.adr) ) < r.size) {
  2153.                 /* Not on a starting tag yet */
  2154.                 if (*adr != '<') {
  2155.                   adr=strchr(adr,'<');
  2156.                   /* Jump to end */
  2157.                   if (!adr)
  2158.                     adr=r.adr+r.size;;
  2159.                 }
  2160.               }
  2161.             }
  2162.           }
  2163.           
  2164.           // ----------
  2165.           // écrire peu à peu
  2166.           if ((opt.getmode & 1) && (ptr>0)) HT_ADD_ADR;
  2167.           lastsaved=adr;    // dernier Θcrit+1
  2168.           // ----------
  2169.           
  2170.           // pour les stats du shell si parsing trop long
  2171. #if HTS_ANALYSTE
  2172.           if (r.size)
  2173.             _hts_in_html_done=(100 * ((int) adr - (int) r.adr) ) / (int)(r.size);
  2174.           if (_hts_in_html_poll) {
  2175.             _hts_in_html_poll=0;
  2176.             // temps à attendre, et remplir autant que l'on peut le cache (backing)
  2177.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2178.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2179.  
  2180.             // Transfer rate
  2181.             engine_stats();
  2182.             
  2183.             // Refresh various stats
  2184.             HTS_STAT.stat_nsocket=back_nsoc(back,back_max);
  2185.             HTS_STAT.stat_errors=fspc(NULL,"error");
  2186.             HTS_STAT.nbk=backlinks_done(liens,lien_tot,ptr);
  2187.             HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,back,back_max);
  2188.  
  2189.             if (!hts_htmlcheck_loop(back,back_max,0,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  2190.               if (opt.errlog) {
  2191.                 fspc(opt.errlog,"info"); fprintf(opt.errlog,"Exit requested by shell or user"LF);
  2192.                 test_flush;
  2193.               } 
  2194.               exit_xh=1;  // exit requested
  2195.               XH_uninit;
  2196.               return 0;
  2197.               //adr = r.adr + r.size;  // exit
  2198.             } else if (_hts_cancel==1) {
  2199.               adr = r.adr + r.size;  // exit
  2200.               _hts_cancel=0;
  2201.             }
  2202.           }
  2203.  
  2204.           // refresh the backing system each 2 seconds
  2205.           if (engine_stats()) {
  2206.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2207.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2208.           }
  2209. #endif
  2210.         } while(( ((int) adr) - ((int) r.adr) ) < r.size);
  2211. #if HTS_ANALYSTE
  2212.         _hts_in_html_parsing=0;  // flag
  2213.         _hts_cancel=0;           // pas de cancel
  2214. #endif
  2215.         if ((opt.getmode & 1) && (ptr>0)) {
  2216.           HT_ADD_END;    // achever
  2217.         }
  2218.         //
  2219.         //
  2220.         //
  2221.       }  // if !error
  2222.       
  2223.       
  2224.       if (opt.getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  2225.       // sauver fichier
  2226.       //structcheck(savename);
  2227.       //filesave(r.adr,r.size,savename);
  2228.       
  2229. #if HTS_ANALYSTE
  2230.     }  // analyse OK
  2231. #endif
  2232.         
  2233.